This document explores a data set from the Annual Status of Education Report (ASER). The raw data was downloaded from https://palnetwork.org/aser-centre/
library(tidyverse)
library(ggplot2)
library(gghighlight)
library(stringr)
library(ggmap)
library(dplyr)
library(cartography)
library(sf)
library(tmap)
library(spData)
library(maps)
library(mapdata)
library(maptools)
library(ggthemes)
library(choroplethr)
library(choroplethrAdmin1)
library(choroplethrMaps)
library(rgdal)
# library(choroplethrZip)
# library(spDataLarge)
# install.packages("spDataLarge") not available
# Load the ASER 2016 school-level and child-level survey tables.
# Each file is read exactly once and assigned; a bare, unassigned read.csv()
# call would parse the file a second time only to dump it to the console.
school <- read.csv("aser/ASER2016GSchool.csv")
child <- read.csv("aser/ASER2016Child.csv")
# Display names for the region codes stored in RID; used as facet labels.
RegionName <- c(
  "2" = "Punjab",
  "3" = "Sindh",
  "4" = "Balochistan",
  "5" = "Khyber Pakhtunkhwa",
  "6" = "Gilgit-Baltistan",
  "7" = "Azad Jammu and Kashmir",
  "8" = "Islamabad - ICT",
  "9" = "Federally Administered Tribal Areas"
)

# Display names for the gender codes stored in C002 (0 = male, -1 = female).
Gender <- c(
  "0" = "Male",
  "-1" = "Female"
)
# Number of distinct child identifiers (CID) in the whole data set.
child %>%
  pull(CID) %>%
  unique() %>%
  length()
## [1] 255196
The total sample size (the number of children) in this dataset is 255196.
# Number of distinct children surveyed in district 266 (Hunza-Nagar).
child %>%
  filter(DID == 266) %>%
  distinct(CID) %>%
  summarise(N_hunza = n())
The sample size of Hunza alone is 1641.
# Mean of the gender code (C002: -1 = female, 0 = male) among Hunza-Nagar
# children; a value closer to -1 means a larger share of girls.
filter(child, DID == 266) %>%
  summarise(gender_proportion = mean(C002))
-1: female, 0: male
gender_proportion = -0.5173675 is slightly below -0.5, which means there are slightly more girls than boys in the Hunza sample.
# Age (C001) distribution of the Hunza-Nagar sample.
ggplot(data = filter(child, DID == 266), mapping = aes(x = C001)) +
  geom_histogram()
Ages are well spread out.
# Enrollment status (C003) of the Hunza-Nagar sample, drawn with three bins.
ggplot(data = filter(child, DID == 266), mapping = aes(x = C003)) +
  geom_histogram(bins = 3)
1 = never enrolled; 2 = drop-out; 3 = currently enrolled
# Enrollment status (C003) in Hunza-Nagar, one panel per gender (C002).
# Only binwidth is supplied: in geom_histogram() a binwidth overrides bins,
# so passing bins = 3 alongside it had no effect and only obscured intent.
child %>%
  filter(DID == 266) %>%
  ggplot(aes(C003)) +
  geom_histogram(binwidth = 1) +
  facet_grid(~C002, labeller = labeller(C002 = Gender))
Both genders look pretty good in terms of the absolute number of currently enrolled children.
# Share of Hunza-Nagar children who are currently enrolled (C003 == 3),
# computed separately for each gender code and shown as labelled bars.
filter(child, DID == 266) %>%
  group_by(C002) %>%
  summarise(enrollment_rate = mean(C003 == 3)) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = C002, y = enrollment_rate)) +
  geom_col() +
  scale_y_continuous() +
  geom_label(mapping = aes(label = enrollment_rate)) +
  # C002 is numeric, so the two codes are relabelled on a continuous axis.
  scale_x_continuous(
    breaks = c(-1, 0),
    labels = c("Female", "Male")
  )
As a rate, both are doing pretty well.
1 = Government; 2 = Private; 3 = Madrasah (conventional religious education) school; 4 = Other (non-formal education facility)
# Distribution of school type (C006) among Hunza-Nagar children.
ggplot(data = filter(child, DID == 266), mapping = aes(x = C006)) +
  geom_histogram()
Most children go to public schools or private schools
# Current-enrollment rate (share with C003 == 3) for each district of
# Gilgit-Baltistan (RID 6). The rate is attached to every child row, and
# geom_count() sizes each point by how many rows share that position.
filter(child, RID == 6) %>%
  group_by(DID) %>%
  mutate(Current_Enrollment_Rate = mean(C003 == 3)) %>%
  ggplot(mapping = aes(x = DID, y = Current_Enrollment_Rate)) +
  geom_count() +
  scale_x_continuous(
    breaks = 260:266,
    labels = c(
      "Gilgit", "Diamer", "Skardu", "Ghanshe",
      "Astore", "Ghizer", "Hunza-Nagar"
    )
  )
Within Gilgit-Baltistan, Hunza outperforms the other districts.
# Box plot of C010 for each district of Gilgit-Baltistan (RID 6).
# DID is numeric, so the group aesthetic is what yields one box per district.
filter(child, RID == 6) %>%
  group_by(DID) %>%
  ggplot(mapping = aes(x = DID, y = C010, group = DID)) +
  geom_boxplot() +
  scale_x_continuous(
    breaks = 260:266,
    labels = c(
      "Gilgit", "Diamer", "Skardu", "Ghanshe",
      "Astore", "Ghizer", "Hunza-Nagar"
    )
  )
1 = Beginner/Nothing; 2 = Letters; 3 = Words; 4 = Sentences; 5 = Story
# Distribution of C010 among Hunza-Nagar children.
ggplot(data = filter(child, DID == 266), mapping = aes(x = C010)) +
  geom_histogram()
# How many Hunza-Nagar children have no recorded C010 value.
filter(child, DID == 266) %>%
  summarise(na = sum(is.na(C010)))
# Distribution of C010 in Hunza-Nagar, one panel per age (C001).
# Children aged 3 and 4 are removed because they have no data. The exclusion
# tests the age column with %in%: `x != c(3, 4)` recycles the pair along the
# rows, so every row would be compared against only one of the two values.
child %>%
  filter(DID == 266, !C001 %in% c(3, 4)) %>%
  ggplot(aes(C010)) +
  geom_histogram() +
  facet_grid(~C001)
# Distribution of C010 in Hunza-Nagar, one panel per gender (C002).
ggplot(data = filter(child, DID == 266), mapping = aes(x = C010)) +
  geom_histogram() +
  facet_grid(~C002, labeller = labeller(C002 = Gender))
# Distribution of C013 in Hunza-Nagar, one panel per age (C001).
# NOTE(review): C013 looks like another assessment score; confirm against
# the ASER codebook.
# Children aged 3 and 4 are removed because they have no data. The exclusion
# tests the age column with %in%: `x != c(3, 4)` recycles the pair along the
# rows, so every row would be compared against only one of the two values.
child %>%
  filter(DID == 266, !C001 %in% c(3, 4)) %>%
  ggplot(aes(C013)) +
  geom_histogram() +
  facet_grid(~C001)
# Distribution of C013 in Hunza-Nagar, one panel per gender (C002).
ggplot(data = filter(child, DID == 266), mapping = aes(x = C013)) +
  geom_histogram() +
  facet_grid(~C002, labeller = labeller(C002 = Gender))
# Histogram of district-level current-enrollment rates, one panel per region.
# The rounded rate is attached to every child row, so bar heights count
# children rather than districts.
child %>%
  group_by(DID) %>%
  mutate(avg = round(mean(C003 == 3), 2)) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = avg)) +
  geom_histogram() +
  labs(title = "Current Enrollment Rate by Region") +
  facet_grid(~RID, labeller = labeller(RID = RegionName))
# Mean C010 of girls (C002 == -1) per district, plotted against the district
# code and coloured by region code; each point is labelled with its DID.
filter(child, C002 == -1) %>%
  group_by(DID) %>%
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = DID, y = avg_learning, color = RID)) +
  geom_point() +
  geom_text(
    mapping = aes(label = DID),
    nudge_x = 5,
    check_overlap = TRUE
  )
# Same girls-only district scatter of mean C010, with the districts of
# region 6 (Gilgit-Baltistan) highlighted against the rest.
filter(child, C002 == -1) %>%
  group_by(DID) %>%
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = DID, y = avg_learning, color = RID)) +
  geom_point() +
  geom_text(
    mapping = aes(label = DID),
    nudge_x = 5,
    check_overlap = TRUE
  ) +
  gghighlight(RID == 6)
It is interesting to note that Gilgit-Baltistan (RID == 6) shows a wide spread in the average learning levels of girls, and that Hunza (DID == 266) is in the top group across all regions.
# Outline map of Pakistan's first-level administrative units with named
# points drawn on top.
data("admin1.map")
pak <- subset(admin1.map, admin == "pakistan")
region <- pak$region
# NOTE(review): `map` is expected to be a data frame with long, lat and name
# columns created elsewhere; confirm it is defined before this chunk runs.
ggplot() +
  geom_polygon(
    data = pak, aes(long, lat, group = group),
    fill = "white", color = "black"
  ) +
  # geom_point() has no label aesthetic, so the names are drawn only by the
  # geom_text() layer below.
  geom_point(data = map, aes(long, lat)) +
  geom_text(
    data = map, aes(long, lat, label = name),
    check_overlap = TRUE, nudge_y = 1
  ) +
  coord_fixed()
# Keep the Sindh rows of ica_df and rewrite their district names to the
# spellings used as join keys on the child side (see the left_join on dname).
ica_df_3 <- filter(ica_df, Province == "sindh")
ica_df_3$Districts <- as.vector(
  ica_df_3$Districts %>%
    str_replace("ghotki", "gotki") %>%
    str_replace("mirpur khas", "mirpurkhas") %>%
    str_replace("malir karachi", "karachi-malir-rural") %>%
    str_replace("naushahro feroze", "nowshero feroze") %>%
    str_replace("kambar shahdad kot", "qambar shahdadkot") %>%
    str_replace("sujawal", "sajawal") %>%
    str_replace("shaheed benazir abad", "shaheed benazirabad") %>%
    str_replace("tando allahyar", "tando allah yar")
)
# Attach the harmonised Sindh rows of ica_df_3 to the Sindh rows of
# child_dname by district name, then total x per district.
child_dname_3 <- child_dname %>%
  filter(RNAME == "Sindh") %>%
  left_join(ica_df_3, by = c("dname" = "Districts"))
child_dname_3 %>%
  group_by(dname) %>%
  summarise(n = sum(x))
## `summarise()` ungrouping output (override with `.groups` argument)
# Print the harmonised Sindh table for inspection.
ica_df_3
# Draw the ica polygons and overlay the child records, coloured and labelled
# by enrollment status (C003).
# NOTE(review): child_ica is assumed to already carry x / y coordinate
# columns; confirm where they are attached.
# NOTE(review): sf warns that st_centroid() is inaccurate on longitude /
# latitude data; consider projecting with st_transform() first.
ica %>%
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) %>%
  ggplot() +
  geom_sf() +
  # geom_point() has no label aesthetic (ggplot2 warned and ignored it), so
  # the labels are drawn only by the geom_text() layer.
  geom_point(data = child_ica, aes(x, y, color = C003)) +
  geom_text(
    data = child_ica, aes(x, y, label = C003),
    check_overlap = TRUE, nudge_y = 1
  )
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
## Warning: Removed 8541 rows containing missing values (geom_text).
# Draw the ica polygons and overlay the child records, coloured by C010.
# NOTE(review): child_ica is assumed to already carry x / y coordinate
# columns; confirm where they are attached.
# NOTE(review): sf warns that st_centroid() is inaccurate on longitude /
# latitude data; consider projecting with st_transform() first.
ica %>%
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) %>%
  ggplot() +
  geom_sf() +
  # geom_point() has no label aesthetic; ggplot2 warned and ignored it, so it
  # is no longer passed.
  geom_point(data = child_ica, aes(x, y, color = C010))
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
# geom_text(data = child_ica, aes(x, y, label = C010), check_overlap = TRUE, nudge_y = 1)
# Draw the ica polygons and overlay the child records, coloured by the
# district mean of the gender code (C002: 0 = male, -1 = female).
# mean() is called without na.rm, so a district containing a missing gender
# code gets an NA ratio.
# NOTE(review): child_ica is assumed to already carry x / y coordinate
# columns; confirm where they are attached.
# NOTE(review): sf warns that st_centroid() is inaccurate on longitude /
# latitude data; consider projecting with st_transform() first.
ica %>%
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) %>%
  ggplot() +
  geom_sf() +
  # geom_point() has no label aesthetic; ggplot2 warned and ignored it, so it
  # is no longer passed.
  geom_point(
    data = child_ica %>%
      group_by(DID) %>%
      mutate(gender_ratio = mean(C002)) %>%
      ungroup(),
    aes(x, y, color = gender_ratio)
  )
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
0: male, -1: female
Thus, a mean of -0.5 indicates complete gender parity.
# Draw the ica polygons and overlay the child records, coloured by the
# district mean of the gender code (C002: 0 = male, -1 = female, missing
# codes ignored). Point size uses the negated mean, so districts with a
# larger share of girls get larger points.
# NOTE(review): child_ica is assumed to already carry x / y coordinate
# columns; confirm where they are attached.
# NOTE(review): sf warns that st_centroid() is inaccurate on longitude /
# latitude data; consider projecting with st_transform() first.
ica %>%
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) %>%
  ggplot() +
  geom_sf() +
  # geom_point() has no label aesthetic; ggplot2 warned and ignored it, so it
  # is no longer passed.
  geom_point(
    data = child_ica %>%
      group_by(DID) %>%
      mutate(gender_ratio = mean(C002, na.rm = TRUE)) %>%
      ungroup(),
    aes(x, y, color = gender_ratio, size = -gender_ratio)
  )
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
# Draw the ica polygons and overlay children aged 14 to 16, coloured and
# sized by the district mean of C010 (missing values ignored).
# NOTE(review): child_ica is assumed to already carry x / y coordinate
# columns; confirm where they are attached.
# NOTE(review): sf warns that st_centroid() is inaccurate on longitude /
# latitude data; consider projecting with st_transform() first.
ica %>%
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) %>%
  ggplot() +
  geom_sf() +
  # geom_point() has no label aesthetic; ggplot2 warned and ignored it, so it
  # is no longer passed.
  geom_point(
    data = child_ica %>%
      # %in% tests membership in the age range. `C001 == 14:16` recycled the
      # three ages along the rows and kept only rows that happened to line up
      # with the recycled value (R also warned about the length mismatch).
      filter(C001 %in% 14:16) %>%
      group_by(DID) %>%
      mutate(avg_learning_level = mean(C010, na.rm = TRUE)) %>%
      ungroup(),
    aes(x, y, color = avg_learning_level, size = avg_learning_level)
  )
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning in C001 == 14:16: 長いオブジェクトの長さが短いオブジェクトの長さの倍数に
## なっていません
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 385 rows containing missing values (geom_point).
# Draw the ica polygons and overlay children aged 12 to 16, coloured by the
# district mean of the gender code (C002) and sized by the district mean of
# C010 (missing values ignored in both).
# NOTE(review): child_ica is assumed to already carry x / y coordinate
# columns; confirm where they are attached.
# NOTE(review): sf warns that st_centroid() is inaccurate on longitude /
# latitude data; consider projecting with st_transform() first.
ica %>%
  mutate(
    centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[, 1],
    y = st_coordinates(centroid)[, 2]
  ) %>%
  ggplot() +
  geom_sf() +
  # geom_point() has no label aesthetic; ggplot2 warned and ignored it, so it
  # is no longer passed.
  geom_point(
    data = child_ica %>%
      # %in% tests membership in the age range. `C001 == 12:16` recycled the
      # five ages along the rows and kept only rows that happened to line up
      # with the recycled value (R also warned about the length mismatch).
      filter(C001 %in% 12:16) %>%
      group_by(DID) %>%
      mutate(
        gender_ratio = mean(C002, na.rm = TRUE),
        avg_learning_level = mean(C010, na.rm = TRUE)
      ) %>%
      ungroup(),
    aes(x, y, color = gender_ratio, size = avg_learning_level)
  ) +
  theme(axis.title = element_text())
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning in C001 == 12:16: 長いオブジェクトの長さが短いオブジェクトの長さの倍数に
## なっていません
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 449 rows containing missing values (geom_point).